San Francisco crime dataset from Kaggle
Data fields:
In [ ]:
# import all required libraries
library(ggplot2) # Data visualization
library(scales) # Plot scaling
library(lattice)
library(data.table) # Much faster data frames
library(dplyr) # Data aggregation etc.
library(ggmap) # Load maps from OSM, etc.
In [ ]:
# Load the training data; the first row of the CSV holds the column names.
train <- fread("../../data/sf-crime/train.csv", header = TRUE)
In [ ]:
# Show the column names and types of the freshly loaded table.
str(object = train)
In [ ]:
# Parse the timestamp column; the second positional argument of as.POSIXct()
# is the time zone.
# NOTE(review): "PST" is not an Olson time-zone name (e.g.
# "America/Los_Angeles"); R may warn and fall back to UTC -- confirm intent.
train$Dates <- as.POSIXct(train$Dates, "PST")

# Derive numeric calendar year and month from the parsed timestamps.
train$Month <- as.numeric(format(train$Dates, "%m"))
train$Year <- as.numeric(format(train$Dates, "%Y"))

# Store the categorical columns as factors.
train$PdDistrict <- as.factor(train$PdDistrict)
train$DayOfWeek <- as.factor(train$DayOfWeek)
train$Category <- as.factor(train$Category)
Optionally restrict to year 2010 (the filter in the next cell is commented out, so all years are kept)
In [ ]:
#train <- subset(train, Year == 2010)
In [ ]:
# Count the events in each Category and sort from most to least frequent.
agg.cat <- train %>%
  group_by(Category) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

# Re-level Category by ascending count so the bar chart is ordered by count
# rather than alphabetically.
agg.cat$Category <- factor(
  agg.cat$Category,
  levels = agg.cat$Category[order(agg.cat$count)]
)

# Horizontal bar chart of the per-category totals.
ggplot(agg.cat, aes(Category, count)) +
  geom_bar(stat = "identity") +
  theme(
    axis.ticks = element_blank(),
    panel.grid.major.y = element_blank()
  ) +
  coord_flip()
In [ ]:
# Drop rows with invalid coordinates: only rows whose Y value is below 40
# are kept.
train <- subset(train, subset = Y < 40)
In [ ]:
# Per-column summary of the table after the coordinate filter.
summary(object = train)
In [ ]:
# Compute the monthly count per Category.
# ungroup() drops the grouping that summarize() leaves behind, so the result
# is handled as a plain table by the merge below.
train.agg <- train %>%
  group_by(Year, Month, Category) %>%
  summarize(count = n()) %>%
  ungroup()

# Represent each (Year, Month) pair by the first day of that month.
train.agg$Date <- as.POSIXct(
  paste(train.agg$Year, train.agg$Month, "01", sep = "-")
)

# Set the monthly count to 0 for (Date, Category) pairs without any events.
# Build every month between the first and the last observed month ...
alldates <- data.frame(
  Date = with(train.agg, seq(min(Date), max(Date), by = "month"))
)
# ... and combine it with every category. The two tables share no column, so
# merge() returns all Date x Category combinations.
allcatdates <- merge(alldates, data.frame(Category = levels(train$Category)))

# Full outer join: combinations missing from train.agg come back with NA count.
train.agg <- merge(train.agg, allcatdates, by = c("Date", "Category"), all = TRUE)

# Index the count vector itself rather than the table: `x[mask]` without a
# comma selects *columns* on a data.frame/tibble and fails for a mask that has
# one entry per row. This form works for data.frame, tibble and data.table.
train.agg$count[is.na(train.agg$count)] <- 0
In [ ]:
# Plot the monthly count per Category, one panel per category with its own
# y-axis range.
# x-axis ticks: every second year from 2003 to 2015, labelled with the year.
breaks <- seq(as.POSIXct("2003-01-01"), as.POSIXct("2015-01-01"), by = "2 years")
ggplot(train.agg, aes(x = Date, y = count, group = Category, colour = Category)) +
  geom_line(size = 0.2) +
  facet_wrap(~Category, ncol = 5, scales = "free_y") +
  scale_x_datetime(breaks = breaks, labels = format(breaks, "%Y")) +
  # Anchor every panel at zero; the upper limit stays data-driven.
  scale_y_continuous(limits = c(0, NA)) +
  ggtitle("Monthly Frequency of Crime Events per Category") +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 4),
    strip.text = element_text(size = 5)
  )
In [ ]:
# Keep only events from 2010 whose Y value is below 40 (the same coordinate
# filter as earlier, repeated so this cell also works on unfiltered data).
train <- subset(train, Y < 40 & Year == 2010)

# Fetch a black-and-white basemap of San Francisco.
# NOTE(review): recent ggmap releases may no longer serve source = "osm" --
# confirm this call still works with the installed version.
sfMap <- get_map(location = "San Francisco", zoom = 12, source = "osm", color = "bw")
In [ ]:
# Overlay the event locations on the basemap, coloured by PdDistrict.
# Small, semi-transparent points keep dense areas readable.
ggmap(sfMap) +
  geom_point(
    data = train,
    mapping = aes(x = X, y = Y, colour = PdDistrict),
    size = 0.1,
    alpha = 0.3
  ) +
  # facet_wrap(~PdDistrict, ncol = 4) +
  theme(legend.position = "none", axis.text = element_text(size = 5))
In [ ]: